In [ ]:
import os
import pandas as pd
from datetime import datetime
import geopy.distance
import matplotlib.pyplot as plt
import pylab as pl
from statsmodels.tsa.seasonal import seasonal_decompose
import pymongo
import plotly.express as px
In [2]:
# Vient lire les dossiers présents dans "DATA" où chaque dossier correspond à un User
# On prend chacun de ces users et on crée un DataFrame avec l'ID du user, son path, et savoir si le dossier contient le fichier labels.txt

DATA_PATH = "../data"
users = []
for user in os.listdir(DATA_PATH):
    user_path = DATA_PATH+"/"+user
    has_label = "labels.txt" in os.listdir(user_path)
    users.append([int(user),user_path,int(has_label)])

df_users = pd.DataFrame(users,columns=["user_id","user_path","has_label"])
df_users
Out[2]:
user_id user_path has_label
0 0 ../data/000 0
1 1 ../data/001 0
2 2 ../data/002 0
3 3 ../data/003 0
4 4 ../data/004 0
... ... ... ...
177 177 ../data/177 0
178 178 ../data/178 0
179 179 ../data/179 1
180 180 ../data/180 0
181 181 ../data/181 0

182 rows × 3 columns

In [3]:
# Première idée de DataFrame

with open("../data/010/Trajectory/20070804033032.plt") as f:
  trajectory_test = [i.split(",") for i in f.read().split("\n")[6:]]
headers = ["Latitude","Longitude","Altitude","Date","Horaire"]
df_test = pd.DataFrame(trajectory_test)
df_test = df_test.drop([2, 4], axis=1).dropna()
df_test.columns = headers
print(df_test)
       Latitude   Longitude Altitude        Date   Horaire
0     39.921712  116.472343       13  2007-08-04  03:30:32
1     39.921705  116.472343       13  2007-08-04  03:30:33
2     39.921695  116.472345       13  2007-08-04  03:30:34
3     39.921683  116.472342       13  2007-08-04  03:30:35
4     39.921672  116.472342       13  2007-08-04  03:30:36
...         ...         ...      ...         ...       ...
1111  39.902912  116.421455      180  2007-08-04  04:14:32
1112  39.902908  116.421432      180  2007-08-04  04:14:33
1113  39.902903  116.421413      180  2007-08-04  04:14:35
1114  39.902892   116.42133      180  2007-08-04  04:14:45
1115  39.902885    116.4213      180  2007-08-04  04:14:46

[1116 rows x 5 columns]
In [4]:
# Fonction pour récupérer la date de départ de chaque trajet

def date_depart(file):
    with open(file) as f:
        data = f.readlines()[6]
        buff_data = data.replace("\n","").split(",")
        date = buff_data[5]
    return date
In [5]:
date_depart("../data/010/Trajectory/20070804033032.plt")
Out[5]:
'2007-08-04'
In [6]:
# Fonction pour récupérer les coordonnées du point de départ

def coord_depart(file):
    with open(file) as f:
        data = f.readlines()[6]
        buff_data = data.replace("\n","").split(",")
    depart_coord = [buff_data[0], buff_data[1]]

    return depart_coord
In [7]:
# Fonction pour récupérer l'horaire du point de départ

def horaire_depart(file):
    with open(file) as f:
        data = f.readlines()[6]
        buff_data = data.replace("\n","").split(",")
    depart_horaire = buff_data[5] + "/" + buff_data[6]

    return depart_horaire
In [8]:
# Fonction pour récupérer les coordonnées du point d'arrivée

def coord_arrive(file):
    with open(file, 'r') as f:
        data = f.readlines()[-1]
        buff_data = data.replace("\n","").split(",")
    arrive_coord = [buff_data[0], buff_data[1]]

    return arrive_coord
In [9]:
# Fonction pour récupérer l'horaire du point d'arrivée

def horaire_arrive(file):
    with open(file, 'r') as f:
        data = f.readlines()[-1]
        buff_data = data.replace("\n","").split(",")
    arrive_horaire = buff_data[5] + "/" + buff_data[6]

    return arrive_horaire
In [10]:
# Fonction pour calculer le temps de trajet

def temps(tps_depart, tps_arrive):

    ## Calcul du temps
    format_data = "%Y-%m-%d/%H:%M:%S"
    depart = datetime.strptime(tps_depart, format_data)
    arrive = datetime.strptime(tps_arrive, format_data)

    time_delta = arrive - depart
    return time_delta
In [11]:
# Fonction pour calculer la distance d'un trajet

def distance(file):
    with open(file, 'r') as f:
        distance = 0
        lines = f.readlines()[6:]

        for i in range(len(lines)-2):
            buff_line = lines[i].split(",")
            buff_line2 = lines[i+1].split(",")
            coords_1 = (buff_line[0], buff_line[1])
            coords_2 = (buff_line2[0], buff_line2[1])
            distance += geopy.distance.geodesic(coords_1, coords_2).km

    return distance
In [12]:
# Test avec les 10 premiers trajets de User10

buffer_file = os.listdir("../data/010/Trajectory/")
tab_file_u40 = buffer_file[:40]
tab_file_u40
Out[12]:
['20070804033032.plt',
 '20070804155303.plt',
 '20070805070503.plt',
 '20070828171302.plt',
 '20070830203928.plt',
 '20070901022340.plt',
 '20070903095208.plt',
 '20070905163053.plt',
 '20070906204521.plt',
 '20070907075003.plt',
 '20070908081710.plt',
 '20070910074631.plt',
 '20070910204430.plt',
 '20070919121546.plt',
 '20070919122147.plt',
 '20070920074804.plt',
 '20070921120306.plt',
 '20070922145432.plt',
 '20070923144426.plt',
 '20071014161216.plt',
 '20071017220238.plt',
 '20071019055858.plt',
 '20071020030002.plt',
 '20071021110759.plt',
 '20071023200800.plt',
 '20071024084732.plt',
 '20071025110456.plt',
 '20071026213100.plt',
 '20071026214019.plt',
 '20071029075850.plt',
 '20071117170827.plt',
 '20071214175547.plt',
 '20071228075610.plt',
 '20071228190036.plt',
 '20071229083859.plt',
 '20071230073845.plt',
 '20071231060928.plt',
 '20071231130809.plt',
 '20071231170243.plt',
 '20080328144824.plt']
In [13]:
%%time
final_tab2 = []
headers2 = ["USER_ID","TRAJET_ID","DATE", "DEPART","ARRIVE","DISTANCE", "TEMPS", "TYPE-TRANSPORT"]
for file in buffer_file:
    buff_file = "../data/010/Trajectory/" + file
    buff_date = date_depart(buff_file)
    buff_depart = coord_depart(buff_file)
    buff_arrive = coord_arrive(buff_file)
    buff_distance = distance(buff_file)
    buff_tps_depart = horaire_depart(buff_file)
    buff_tps_arrive = horaire_arrive(buff_file)
    buff_temps = temps(buff_tps_depart, buff_tps_arrive)
    transport = 'velo'

    final_tab2.append(['10', file, buff_date, buff_depart, buff_arrive, buff_distance, buff_temps, transport])

df_test2 = pd.DataFrame(final_tab2)
df_test2.columns = headers2

# Trie du dataframe en fonction de la colonne [Date]
df_test2 = df_test2.sort_values(by = 'DATE')
df_test2
CPU times: total: 1min 26s
Wall time: 2min 7s
Out[13]:
USER_ID TRAJET_ID DATE DEPART ARRIVE DISTANCE TEMPS TYPE-TRANSPORT
0 10 20070804033032.plt 2007-08-04 [39.921712, 116.472343] [39.902885, 116.4213] 7.579780 0 days 00:44:14 velo
1 10 20070804155303.plt 2007-08-04 [42.017857, 123.506235] [42.258245, 123.790855] 36.828315 0 days 00:19:49 velo
2 10 20070805070503.plt 2007-08-05 [44.589055, 129.603843] [44.180203, 125.49309] 521.588151 0 days 10:08:07 velo
3 10 20070828171302.plt 2007-08-28 [39.900917, 116.420018] [39.118588, 117.24275] 141.574358 0 days 01:08:41 velo
4 10 20070830203928.plt 2007-08-30 [39.12299, 117.244615] [39.135748, 117.219655] 3.740095 0 days 00:17:29 velo
... ... ... ... ... ... ... ... ...
156 10 20090227210324.plt 2009-02-27 [39.991438, 116.329449] [39.137058, 117.219798] 136.516925 0 days 02:27:11 velo
157 10 20090301131323.plt 2009-03-01 [39.136643, 117.218026] [39.991345, 116.32791] 135.217493 0 days 02:01:29 velo
158 10 20090307044707.plt 2009-03-07 [39.991198, 116.330976] [39.992358, 116.325195] 136.453900 0 days 05:24:23 velo
159 10 20090315093133.plt 2009-03-15 [39.994676, 116.326561] [39.992513, 116.326305] 41.408222 0 days 04:31:43 velo
160 10 20090321032156.plt 2009-03-21 [39.9921, 116.331613] [39.136253, 117.21831] 132.865119 0 days 02:12:57 velo

161 rows × 8 columns

In [14]:
# L'index devient la DATE du trajet afin de faire des stats dessus
df_test2.DATE = pd.to_datetime(df_test2.DATE)
df_test2.set_index('DATE', inplace=True)
df_test2
Out[14]:
USER_ID TRAJET_ID DEPART ARRIVE DISTANCE TEMPS TYPE-TRANSPORT
DATE
2007-08-04 10 20070804033032.plt [39.921712, 116.472343] [39.902885, 116.4213] 7.579780 0 days 00:44:14 velo
2007-08-04 10 20070804155303.plt [42.017857, 123.506235] [42.258245, 123.790855] 36.828315 0 days 00:19:49 velo
2007-08-05 10 20070805070503.plt [44.589055, 129.603843] [44.180203, 125.49309] 521.588151 0 days 10:08:07 velo
2007-08-28 10 20070828171302.plt [39.900917, 116.420018] [39.118588, 117.24275] 141.574358 0 days 01:08:41 velo
2007-08-30 10 20070830203928.plt [39.12299, 117.244615] [39.135748, 117.219655] 3.740095 0 days 00:17:29 velo
... ... ... ... ... ... ... ...
2009-02-27 10 20090227210324.plt [39.991438, 116.329449] [39.137058, 117.219798] 136.516925 0 days 02:27:11 velo
2009-03-01 10 20090301131323.plt [39.136643, 117.218026] [39.991345, 116.32791] 135.217493 0 days 02:01:29 velo
2009-03-07 10 20090307044707.plt [39.991198, 116.330976] [39.992358, 116.325195] 136.453900 0 days 05:24:23 velo
2009-03-15 10 20090315093133.plt [39.994676, 116.326561] [39.992513, 116.326305] 41.408222 0 days 04:31:43 velo
2009-03-21 10 20090321032156.plt [39.9921, 116.331613] [39.136253, 117.21831] 132.865119 0 days 02:12:57 velo

161 rows × 7 columns

In [15]:
# PLOT DE TOUS LES DEPLACEMENT DE L'USER
df_test2.loc['2007': '2009', 'DISTANCE'].plot()
Out[15]:
<AxesSubplot:xlabel='DATE'>
In [16]:
# PLOT ENTRE 2008-09 ET 2008-10
df_test2.loc['2008-09': '2008-10', 'DISTANCE'].plot()
Out[16]:
<AxesSubplot:xlabel='DATE'>
In [17]:
# Regrouper les données par semaine
# Y = Year / M = Month / W = Week / D = Day / h = Hour / m = minute
df_test2.loc['2008-09': '2008-10', 'DISTANCE'].resample('W').plot()
plt.show()
In [18]:
# Regrouper les données par semaine
# Y = Year / M = Month / W = Week / D = Day / h = Hour / m = minute
# Ici cas dans une semaine
df_test2.loc['2008-10-01': '2008-10-07', 'DISTANCE'].plot()
plt.show()
In [19]:
# Regrouper les données par semaine
# Y = Year / M = Month / W = Week / D = Day / h = Hour / m = minute
# Ici cas dans une semaine
df_test2.loc['2008-10-08': '2008-10-14', 'DISTANCE'].plot()
plt.show()
In [20]:
# AJOUT .mean() POUR AVOIR LA MOYENNE ICI ENTRE 2007 et 2009 SUR CHAQUE MOIS
# On peut faire 2W pour avoir la moyenne sur 2 semaine
df_test2.loc['2007': '2009', 'DISTANCE'].resample('W').mean().plot()
plt.show()
In [21]:
df_test2.loc['2007': '2009', 'DISTANCE'].resample('W').mean()
Out[21]:
DATE
2007-08-05    188.665415
2007-08-12           NaN
2007-08-19           NaN
2007-08-26           NaN
2007-09-02     52.694549
                 ...    
2009-02-22           NaN
2009-03-01    135.867209
2009-03-08    136.453900
2009-03-15     41.408222
2009-03-22    132.865119
Freq: W-SUN, Name: DISTANCE, Length: 86, dtype: float64
In [22]:
v2 = df_test2.loc['2007': '2009', 'DISTANCE']
v2 = v2.dropna(axis=0)
v2.resample('Y').plot()
plt.show()
In [23]:
# Tableau qui contient TOUTES les statistiques regrouper par semaine
m = df_test2['DISTANCE'].resample('W').agg(['mean', 'min', 'max'])
m = m.dropna(axis=0)
m
Out[23]:
mean min max
DATE
2007-08-05 188.665415 7.579780 521.588151
2007-09-02 52.694549 3.740095 141.574358
2007-09-09 117.010763 0.534408 152.263180
2007-09-16 141.174913 141.151165 141.198662
2007-09-23 487.017248 0.000000 1896.601707
2007-10-14 8.474745 8.474745 8.474745
2007-10-21 452.405129 13.074198 903.058800
2007-10-28 117.826150 0.953561 357.597888
2007-11-04 172.599180 172.599180 172.599180
2007-11-18 2.386127 2.386127 2.386127
2007-12-16 0.874360 0.874360 0.874360
2007-12-30 371.297449 7.598410 1336.970124
2008-01-06 76.233231 2.616714 173.872079
2008-03-30 789.402972 40.563641 1482.945005
2008-04-06 875.530523 9.799302 1488.715699
2008-05-18 30.094051 30.094051 30.094051
2008-05-25 158.996456 158.996456 158.996456
2008-06-15 78.460888 1.804055 155.117721
2008-06-22 412.071493 64.587769 1016.879548
2008-06-29 9.678511 6.875718 12.481304
2008-08-03 1480.011872 1480.011872 1480.011872
2008-09-21 78.940948 12.630421 134.496513
2008-09-28 207.047476 11.731297 1889.460371
2008-10-05 978.507356 12.559579 2039.850203
2008-10-12 111.524805 9.099131 137.478298
2008-10-19 80.373230 5.465194 140.088022
2008-10-26 9.072080 8.392449 9.751710
2008-11-02 58.790511 12.940038 133.391211
2008-11-09 58.509264 12.889444 140.571643
2008-12-07 128.070790 8.253144 231.666171
2008-12-14 140.366129 133.256700 147.475559
2008-12-21 2929.076701 2929.076701 2929.076701
2008-12-28 113.490308 0.690497 165.621845
2009-01-04 93.817988 4.743728 143.685910
2009-01-11 149.108992 129.426850 168.791134
2009-01-18 46.443713 23.816281 89.971945
2009-01-25 137.989135 137.989135 137.989135
2009-02-01 129.980518 129.980518 129.980518
2009-02-08 138.322373 133.470686 143.174060
2009-02-15 154.286165 135.680966 172.891365
2009-03-01 135.867209 135.217493 136.516925
2009-03-08 136.453900 136.453900 136.453900
2009-03-15 41.408222 41.408222 41.408222
2009-03-22 132.865119 132.865119 132.865119
In [24]:
# On va se concentrer sur 2008 car c'est celui qui à le plus de valeurs
plt.figure(figsize=(12,8))
m['mean']['2007'].plot(label= 'moyenne en 2007', lw=2, ls='--', alpha=0.8)
m['mean']['2008'].plot(label= 'moyenne en 2008')
m['mean']['2009'].plot(label= 'moyenne en 2008', lw=3, ls=':', alpha=0.8)
#plt.fill_between(m.index, m['max'], m['min'], alpha=0.2, label='min-max par semaine')

plt.legend()
plt.show()
In [25]:
df_test2.loc['2007': '2009', 'DISTANCE'].resample('Y').mean().plot(label= 'moyenne regroupé par année', lw=2, ls='--', alpha=0.8)
df_test2.loc['2007': '2009', 'DISTANCE'].resample('M').mean().plot(label= 'moyenne regroupé par mois', lw=3, ls=':', alpha=0.8)
df_test2.loc['2007': '2009', 'DISTANCE'].resample('W').mean().plot(label= 'moyenne regroupé par semaine')
plt.legend()
plt.show()

SAISONNABILITE¶

In [26]:
analysis = df_test2[['DISTANCE']].copy()
#decompose_result_mult = seasonal_decompose(analysis, model="additive")

# trend = decompose_result_mult.trend
# seasonal = decompose_result_mult.seasonal
# residual = decompose_result_mult.resid
#
# decompose_result_mult.plot();
In [27]:
analysis.index
Out[27]:
DatetimeIndex(['2007-08-04', '2007-08-04', '2007-08-05', '2007-08-28',
               '2007-08-30', '2007-09-01', '2007-09-03', '2007-09-05',
               '2007-09-06', '2007-09-07',
               ...
               '2009-01-31', '2009-02-07', '2009-02-08', '2009-02-13',
               '2009-02-14', '2009-02-27', '2009-03-01', '2009-03-07',
               '2009-03-15', '2009-03-21'],
              dtype='datetime64[ns]', name='DATE', length=161, freq=None)
In [28]:
analysis
# apply the dtype attribute
result = analysis.dtypes

print("Output:")
print(result)
Output:
DISTANCE    float64
dtype: object

Analyse BD¶

In [29]:
uri = 'mongodb+srv://admin:uvsqawsgroupe17@cluster0.nkdni.mongodb.net/?retryWrites=true&w=majority'
myclient = pymongo.MongoClient(uri)

mydb = myclient["DonneeGPS"]
mycol = mydb["DATAGPS"]


x = mycol.find({'USER_ID': '000'})
y = mycol.find({'USER_ID': '001'})
z = mycol.find({'USER_ID': '010'})
In [30]:
# PLOT DE TOUS LES DEPLACEMENT DE L'USER
# Utilisateur 000
df_plot = pd.DataFrame.from_dict(x)
df_plot_001 = pd.DataFrame.from_dict(y)
df_plot_010 = pd.DataFrame.from_dict(z)
df_plot_010
Out[30]:
_id USER_ID DATE DEPART ARRIVE DISTANCE TEMPS TYPE-TRANSPORT DOM-TRAV
0 638e32ba429ed8f316063d75 010 2007-08-04/03:30:32 (39.921712, 116.472343) (39.920587, 116.472327) 0.15591323546084734 0:02:26 marche UNDEFINED
1 638e32ba429ed8f316063d76 010 2007-08-04/03:32:58 (39.920612, 116.47233) (39.919843, 116.472015) 0.09839271676975873 0:00:30 velo UNDEFINED
2 638e32ba429ed8f316063d77 010 2007-08-04/03:33:28 (39.919847, 116.472015) (39.919595, 116.47183) 0.041901067721451773 0:01:18 marche UNDEFINED
3 638e32ba429ed8f316063d78 010 2007-08-04/03:34:46 (39.91961, 116.471828) (39.918383, 116.472317) 0.1493635058754237 0:00:33 velo UNDEFINED
4 638e32ba429ed8f316063d79 010 2007-08-04/03:35:19 (39.918387, 116.472333) (39.916862, 116.471713) 0.23231533390805081 0:02:58 marche UNDEFINED
... ... ... ... ... ... ... ... ... ...
12993 638e32bc429ed8f316067036 010 2009-03-21/05:29:50 (39.129293, 117.21125) (39.130798, 117.209983) 0.1946596240649425 0:00:22 voiture/bus/taxi UNDEFINED
12994 638e32bc429ed8f316067037 010 2009-03-21/05:30:12 (39.13075, 117.210046) (39.135231, 117.214251) 0.7305619825803901 0:01:07 voiture/bus/taxi UNDEFINED
12995 638e32bc429ed8f316067038 010 2009-03-21/05:31:19 (39.135206, 117.214216) (39.13528, 117.214281) 0.015433977507834774 0:00:22 marche UNDEFINED
12996 638e32bc429ed8f316067039 010 2009-03-21/05:31:41 (39.13528, 117.214281) (39.135571, 117.216216) 0.17897305062830568 0:00:44 velo UNDEFINED
12997 638e32bc429ed8f31606703a 010 2009-03-21/05:32:25 (39.135546, 117.216201) (39.136256, 117.218303) 0.22277101475818747 0:02:28 marche UNDEFINED

12998 rows × 9 columns

In [31]:
df_plot["TYPE-TRANSPORT"].value_counts()
Out[31]:
marche              1175
voiture/bus/taxi     760
velo                 692
train                 38
Name: TYPE-TRANSPORT, dtype: int64
In [32]:
fig = px.bar(df_plot, x="TYPE-TRANSPORT", title='TYPE-TRANSPORT')
fig.show()
In [33]:
fig = px.bar(df_plot_001, x="TYPE-TRANSPORT", title='TYPE-TRANSPORT')
fig.show()
In [34]:
# La fonction pour le plot par mois

dates = df_plot.DATE.tolist()
months = [date[:7] for date in dates]
df_plot["MONTH"] = months
fig = px.bar(df_plot, x="MONTH",color='TYPE-TRANSPORT',barmode='group',height=400)

fig.update_traces(dict(marker_line_width=0))
fig.show()
In [35]:
# L'index devient la DATE du trajet afin de faire des stats dessus
df_plot_copy = df_plot.copy()
df_plot_copy.DATE = pd.to_datetime(df_plot_copy.DATE)
df_plot_copy.set_index('DATE', inplace=True)
In [36]:
# Tableau qui contient TOUTES les statistiques regrouper par semaine
df_plot_copy.DISTANCE = df_plot_copy.DISTANCE.astype(float)

m = df_plot_copy['DISTANCE'].resample('W').agg(['mean', 'min', 'max'])
m = m.dropna(axis=0)
m
Out[36]:
mean min max
DATE
2008-10-26 1.534044 0.092277 6.134741
2008-11-02 1.727254 0.083017 7.527443
2008-11-09 3.530503 0.424163 8.943539
2008-11-16 1.232348 0.068799 4.386517
2008-11-23 0.825494 0.023192 4.073307
2008-12-07 0.728412 0.003140 2.103240
2008-12-14 1.164513 0.034702 3.284624
2009-04-05 1.127259 0.000140 10.211117
2009-04-12 1.545150 0.000634 17.813733
2009-04-19 1.640630 0.001691 16.519590
2009-04-26 1.300816 0.018620 9.245312
2009-05-03 1.292563 0.012914 10.432872
2009-05-10 1.161325 0.032873 3.948896
2009-05-17 1.045953 0.038384 3.041003
2009-05-24 1.263743 0.008401 8.971864
2009-05-31 1.088704 0.061086 5.466519
2009-06-07 1.245453 0.036099 8.298692
2009-06-14 1.097408 0.002445 4.219826
2009-06-21 1.157959 0.012138 4.387961
2009-06-28 1.316769 0.038310 10.093630
2009-07-05 1.184727 0.000420 13.338901
In [37]:
# On va se concentrer sur 2008 car c'est celui qui à le plus de valeurs
plt.figure(figsize=(12,8))
m['mean']['2008'].plot(label= 'moyenne en 2008', lw=2, ls='--', alpha=0.8)
m['mean']['2009'].plot(label= 'moyenne en 2009', lw=3, ls=':', alpha=0.8)
#plt.fill_between(m.index, m['max'], m['min'], alpha=0.2, label='min-max par semaine')

plt.legend()
plt.show()
In [38]:
# Transport
buff_marche = (df_plot['TYPE-TRANSPORT'] == 'marche').sum()
buff_train = (df_plot['TYPE-TRANSPORT'] == 'train').sum()
buff_velo = (df_plot['TYPE-TRANSPORT'] == 'velo').sum()
buff_voiture = (df_plot['TYPE-TRANSPORT'] == 'voiture/bus/taxi').sum()
buff_avion = (df_plot['TYPE-TRANSPORT'] == 'airplane').sum()
In [39]:
df_plot
Out[39]:
_id USER_ID DATE DEPART ARRIVE DISTANCE TEMPS TYPE-TRANSPORT DOM-TRAV MONTH
0 638e32b6429ed8f31605d088 000 2008-10-23/02:53:04 (39.984702, 116.318417) (39.984045, 116.298725) 2.0723741546645145 0:12:11 velo UNDEFINED 2008-10
1 638e32b6429ed8f31605d089 000 2008-10-23/02:53:04 (39.984702, 116.318417) (39.984045, 116.298725) 2.0723741546645145 0:12:11 velo UNDEFINED 2008-10
2 638e32b6429ed8f31605d08a 000 2008-10-23/04:08:07 (39.995777, 116.286798) (39.984498, 116.299407) 2.0763510121490385 0:02:00 voiture/bus/taxi UNDEFINED 2008-10
3 638e32b6429ed8f31605d08b 000 2008-10-23/04:10:07 (39.984499, 116.299405) (39.990325, 116.310258) 1.7731990619742037 0:09:55 velo UNDEFINED 2008-10
4 638e32b6429ed8f31605d08c 000 2008-10-23/04:20:02 (39.990219, 116.310215) (39.990887, 116.310479) 0.0922767965163593 0:01:35 marche UNDEFINED 2008-10
... ... ... ... ... ... ... ... ... ... ...
2660 638e32b6429ed8f31605daec 000 2009-07-05/07:33:15 (39.984025, 116.306875) (39.984199, 116.308835) 0.24069283883115963 0:01:14 velo UNDEFINED 2009-07
2661 638e32b6429ed8f31605daed 000 2009-07-05/07:34:29 (39.984182, 116.308526) (39.991446, 116.323739) 2.0499225963154446 0:03:25 voiture/bus/taxi UNDEFINED 2009-07
2662 638e32b6429ed8f31605daee 000 2009-07-05/07:37:54 (39.991416, 116.323173) (39.996502, 116.328972) 1.0899619160797875 0:03:29 velo UNDEFINED 2009-07
2663 638e32b6429ed8f31605daef 000 2009-07-05/07:41:23 (39.996488, 116.328608) (39.999958, 116.327362) 0.5767519160190993 0:01:15 voiture/bus/taxi UNDEFINED 2009-07
2664 638e32b6429ed8f31605daf0 000 2009-07-05/07:42:38 (39.999985, 116.327424) (40.000522, 116.327132) 0.10156384139126953 0:02:37 marche UNDEFINED 2009-07

2665 rows × 10 columns

In [40]:
# Regrouper les données par semaine
# Y = Year / M = Month / W = Week / D = Day / h = Hour / m = minute
df_plot_copy.loc['2008-10': '2008-11', 'DISTANCE'].resample('W').plot()
plt.show()
In [41]:
fig = px.bar(df_plot, x='DEPART')
fig.show()
In [42]:
buff = df_plot_copy.groupby('TYPE-TRANSPORT').sum()['DISTANCE'].sort_values()
fig = px.bar(buff, x="DISTANCE", title='DISTANCE TOTAL PAR TYPE-TRANSPORT (USER 000)')
fig.show()
C:\Program Files\KMSpico\temp\ipykernel_104252\2862209076.py:1: FutureWarning:

The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.

In [45]:
lat = []
long = []
count = [1 for i in range(len(df_plot))]
for depart in df_plot.DEPART.tolist():
    coord = depart.replace(")","").replace("(","").split(",")
    lat.append(float(coord[0]))
    long.append(float(coord[1]))

fig = px.density_mapbox(df_plot, lat=lat, lon=long, z=count,
                        mapbox_style="stamen-terrain")

fig
In [46]:
# Comparison USER 000 et USER 001
# inner = assemble les dataframe sur les valeurs en communs
fusion = pd.merge(df_plot, df_plot_001, on='TYPE-TRANSPORT', how='inner', suffixes=('_user000', '_user001'))
fusion
Out[46]:
_id_user000 USER_ID_user000 DATE_user000 DEPART_user000 ARRIVE_user000 DISTANCE_user000 TEMPS_user000 TYPE-TRANSPORT DOM-TRAV_user000 MONTH _id_user001 USER_ID_user001 DATE_user001 DEPART_user001 ARRIVE_user001 DISTANCE_user001 TEMPS_user001 DOM-TRAV_user001
0 638e32b6429ed8f31605d088 000 2008-10-23/02:53:04 (39.984702, 116.318417) (39.984045, 116.298725) 2.0723741546645145 0:12:11 velo UNDEFINED 2008-10 638e32b6429ed8f31605daf1 001 2008-10-23/05:53:05 (39.984094, 116.319236) (39.978051, 116.327538) 1.5427186349264368 0:08:52 UNDEFINED
1 638e32b6429ed8f31605d088 000 2008-10-23/02:53:04 (39.984702, 116.318417) (39.984045, 116.298725) 2.0723741546645145 0:12:11 velo UNDEFINED 2008-10 638e32b6429ed8f31605daf2 001 2008-10-23/05:53:05 (39.984094, 116.319236) (39.978051, 116.327538) 1.5427186349264368 0:08:52 UNDEFINED
2 638e32b6429ed8f31605d088 000 2008-10-23/02:53:04 (39.984702, 116.318417) (39.984045, 116.298725) 2.0723741546645145 0:12:11 velo UNDEFINED 2008-10 638e32b6429ed8f31605daf5 001 2008-10-23/10:35:36 (39.98028, 116.326988) (39.983215, 116.326708) 0.3188556326688519 0:00:59 UNDEFINED
3 638e32b6429ed8f31605d088 000 2008-10-23/02:53:04 (39.984702, 116.318417) (39.984045, 116.298725) 2.0723741546645145 0:12:11 velo UNDEFINED 2008-10 638e32b6429ed8f31605daf6 001 2008-10-23/10:36:35 (39.983115, 116.326693) (40.009928, 116.314928) 3.876949395296889 0:19:35 UNDEFINED
4 638e32b6429ed8f31605d088 000 2008-10-23/02:53:04 (39.984702, 116.318417) (39.984045, 116.298725) 2.0723741546645145 0:12:11 velo UNDEFINED 2008-10 638e32b6429ed8f31605daf8 001 2008-10-23/10:57:09 (40.009645, 116.312623) (40.01615, 116.307288) 1.076005654344659 0:06:04 UNDEFINED
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1409307 638e32b6429ed8f31605d9fd 000 2009-06-28/09:27:07 (31.109591, 121.077294) (31.137361, 121.129715) 5.829406390378319 0:03:20 train UNDEFINED 2009-06 638e32b6429ed8f31605dc42 001 2008-11-01/04:16:32 (40.07093, 116.299532) (40.060837, 116.295175) 1.8610431637789926 0:01:17 UNDEFINED
1409308 638e32b6429ed8f31605d9fe 000 2009-06-28/09:30:27 (31.137328, 121.128074) (31.148903, 121.232573) 10.093629729871989 0:06:40 train UNDEFINED 2009-06 638e32b6429ed8f31605dc42 001 2008-11-01/04:16:32 (40.07093, 116.299532) (40.060837, 116.295175) 1.8610431637789926 0:01:17 UNDEFINED
1409309 638e32b6429ed8f31605d9ff 000 2009-06-28/09:37:07 (31.148914, 121.231212) (31.150603, 121.25979) 2.622502561565015 0:01:40 train UNDEFINED 2009-06 638e32b6429ed8f31605dc42 001 2008-11-01/04:16:32 (40.07093, 116.299532) (40.060837, 116.295175) 1.8610431637789926 0:01:17 UNDEFINED
1409310 638e32b6429ed8f31605da00 000 2009-06-28/09:38:47 (31.150392, 121.258572) (31.154602, 121.284589) 2.384399904536095 0:01:40 train UNDEFINED 2009-06 638e32b6429ed8f31605dc42 001 2008-11-01/04:16:32 (40.07093, 116.299532) (40.060837, 116.295175) 1.8610431637789926 0:01:17 UNDEFINED
1409311 638e32b6429ed8f31605da31 000 2009-06-29/02:12:25 (31.29325, 121.438485) (31.209642, 121.373726) 13.338901129116042 0:10:00 train UNDEFINED 2009-06 638e32b6429ed8f31605dc42 001 2008-11-01/04:16:32 (40.07093, 116.299532) (40.060837, 116.295175) 1.8610431637789926 0:01:17 UNDEFINED

1409312 rows × 18 columns

In [47]:
# Changement String --> Float pour les plots
fusion.DISTANCE_user000 = fusion.DISTANCE_user000.astype(float)
fusion.DISTANCE_user001 = fusion.DISTANCE_user001.astype(float)

fusion[['DISTANCE_user000']].plot(figsize=(12, 8))
Out[47]:
<AxesSubplot:>
In [ ]: